# ---- Data load & preparation ----------------------------------------------
# Read the BRFSS 2020 heart-disease dataset and coerce categorical columns
# to factors with sensible level orderings.
heartdata <- data.frame(read.csv("heart_2020_cleaned.csv"))
n <- 10  # subsampling stride (used only by the commented-out line below)
#heartdata <- heartdata[seq(1,nrow(heartdata),n),]

# Plain Yes/No (or Male/Female) columns: default alphabetical levels are fine,
# so convert them all in one pass instead of ten separate assignments.
yn_cols <- c("HeartDisease", "Smoking", "AlcoholDrinking", "Stroke",
             "DiffWalking", "Sex", "PhysicalActivity", "Asthma",
             "KidneyDisease", "SkinCancer")
heartdata[yn_cols] <- lapply(heartdata[yn_cols], as.factor)

# Race: shorten the longest label, then impose an explicit level order.
heartdata$Race <- gsub("American Indian/Alaskan Native", "Native", heartdata$Race)
heartdata$Race <- factor(heartdata$Race, levels = c("White", "Hispanic", "Black", "Asian", "Native", "Other"))

# Diabetic: shorten the borderline label, order roughly from "No" to "Yes".
heartdata$Diabetic <- gsub("No, borderline diabetes", "Borderline", heartdata$Diabetic)
heartdata$Diabetic <- factor(heartdata$Diabetic, levels = c("No", "Borderline", "Yes (during pregnancy)", "Yes"))

# GenHealth: explicit worst-to-best ordering for plotting.
heartdata$GenHealth <- factor(heartdata$GenHealth, levels = c("Poor", "Fair", "Good", "Very good", "Excellent"))

# Convert the 5-year age bands to a numeric Age: take the band's upper bound
# ("80 or older" is capped at 84) and subtract a uniform 0-4 jitter.
# NOTE(review): sample() is not seeded here, so Age differs run to run.
heartdata$AgeCategory <- gsub(" or older", "-84", as.character(heartdata$AgeCategory))
heartdata$HiAge <- as.numeric(substr(heartdata$AgeCategory, 4, 5))
heartdata$rand <- sample(0:4, size = nrow(heartdata), replace = TRUE)
heartdata$Age <- with(heartdata, HiAge - rand)
#heartdata <- subset(heartdata,select=-c(AgeCategory,HiAge,rand))

# Convenience subsets by outcome, used by later plots.
posdis <- subset(heartdata, HeartDisease == "Yes")
negdis <- subset(heartdata, HeartDisease == "No")
str(heartdata)
# My functions
# Custom ggproto Geom that draws half ("split") violins: odd-numbered groups
# render the left half of the violin, even-numbered groups the right half,
# so two fill groups share a single violin outline. Adapted from the
# GeomViolin$draw_group source.
GeomSplitViolin <- ggproto("GeomSplitViolin", GeomViolin, draw_group = function(self, data, ..., draw_quantiles = NULL){
# Left/right outline x-coordinates scaled by the estimated violin width.
data <- transform(data, xminv = x - violinwidth * (x - xmin), xmaxv = x + violinwidth * (xmax - x))
grp <- data[1,'group']
# Odd groups keep only the left outline (xminv), even groups the right (xmaxv);
# sort order is flipped so the polygon is traced in a consistent direction.
newdata <- plyr::arrange(transform(data, x = if(grp%%2==1) xminv else xmaxv), if(grp%%2==1) y else -y)
# Close the polygon by repeating first/last points, snapped to the centre line.
newdata <- rbind(newdata[1, ], newdata, newdata[nrow(newdata), ], newdata[1, ])
newdata[c(1,nrow(newdata)-1,nrow(newdata)), 'x'] <- round(newdata[1, 'x'])
if (length(draw_quantiles) > 0 & !scales::zero_range(range(data$y))) {
stopifnot(all(draw_quantiles >= 0), all(draw_quantiles <=
1))
# Build quantile line segments and carry over non-positional aesthetics.
# NOTE(review): create_quantile_segment_frame is a ggplot2 internal; it may
# need a ggplot2::: prefix depending on the installed ggplot2 version.
quantiles <- create_quantile_segment_frame(data, draw_quantiles)
aesthetics <- data[rep(1, nrow(quantiles)), setdiff(names(data), c("x", "y")), drop = FALSE]
aesthetics$alpha <- rep(1, nrow(quantiles))
both <- cbind(quantiles, aesthetics)
quantile_grob <- GeomPath$draw_panel(both, ...)
ggplot2:::ggname("geom_split_violin", grobTree(GeomPolygon$draw_panel(newdata, ...), quantile_grob))
}
else {
# No quantile lines requested (or degenerate y range): draw the half violin only.
ggplot2:::ggname("geom_split_violin", GeomPolygon$draw_panel(newdata, ...))
}
})
# Layer constructor for the split-violin geom. Mirrors ggplot2::geom_violin()'s
# signature so it can be used as a drop-in replacement in a plot spec.
geom_split_violin <- function (mapping = NULL, data = NULL, stat = "ydensity", position = "identity", ..., draw_quantiles = NULL, trim = TRUE, scale = "area", na.rm = FALSE, show.legend = NA, inherit.aes = TRUE) {
  # Geom-specific parameters are gathered first, then handed to layer().
  geom_params <- list(
    trim = trim,
    scale = scale,
    draw_quantiles = draw_quantiles,
    na.rm = na.rm,
    ...
  )
  layer(
    data = data,
    mapping = mapping,
    stat = stat,
    geom = GeomSplitViolin,
    position = position,
    show.legend = show.legend,
    inherit.aes = inherit.aes,
    params = geom_params
  )
}
# Return the most frequent value (statistical mode) of a vector.
# Ties are broken in favour of the value that appears first in `v`.
getmode <- function(v) {
  distinct_vals <- unique(v)
  counts <- tabulate(match(v, distinct_vals))
  distinct_vals[which.max(counts)]
}
# Histograms of numerical variables
# Vertical reference lines: red = mean, blue = median, orange = mode (getmode).
ggplot(data=heartdata, aes(x=BMI))+geom_histogram(bins=50)+xlim(0,75)+ggtitle("Distribution of BMI")+geom_vline(xintercept = mean(heartdata$BMI,na.rm = TRUE), color = "red", size=1.5)+geom_vline(xintercept = median(heartdata$BMI,na.rm = TRUE), color = "blue", size=1.5)+geom_vline(xintercept = getmode(heartdata$BMI), color = "orange", size=1.5)
ggplot(data=heartdata, aes(x=PhysicalHealth))+geom_histogram(bins=30)
ggplot(data=heartdata, aes(x=MentalHealth))+geom_histogram(bins=30)
ggplot(data=heartdata, aes(x=SleepTime))+geom_histogram(bins=12)+xlim(2,13)+ggtitle("Distribution of Sleep Time")+geom_vline(xintercept = mean(heartdata$SleepTime,na.rm = TRUE), color = "red", size=1.5)+geom_vline(xintercept = median(heartdata$SleepTime,na.rm = TRUE), color = "blue", size=1.5)+geom_vline(xintercept = getmode(heartdata$SleepTime), color = "orange", size=1.5)
ggplot(data=heartdata, aes(x=Age))+geom_histogram(bins=65)+ggtitle("Distribution of Ages")+geom_vline(xintercept = mean(heartdata$Age,na.rm = TRUE), color = "red", size=1.5)+geom_vline(xintercept = median(heartdata$Age,na.rm = TRUE), color = "blue", size=1.5)+geom_vline(xintercept = getmode(heartdata$Age), color = "orange", size=1.5)
# Comparison of factors
# 2x2 tables have rows = HeartDisease (No, Yes) and cols = second factor (No, Yes).
# Single-bracket indexing is column-major: [1] = (No disease, No), [2] = (disease, No),
# [3] = (No disease, Yes), [4] = (disease, Yes). Each ratio below is therefore the
# share of disease status *within* one level of the second factor.
print("Heart disease:")
hea<-table(heartdata$HeartDisease)
hea
#No heart disease
hea[1]/(hea[1]+hea[2])
#Heart disease
hea[2]/(hea[1]+hea[2])
print("Smoking and heart disease:")
smo<-table(heartdata$HeartDisease,heartdata$Smoking)
smo
#No smoking, no heart disease
smo[1]
smo[1]/(smo[1]+smo[2])
#No smoking, heart disease
smo[2]
smo[2]/(smo[1]+smo[2])
#Smoking, no heart disease
smo[3]
smo[3]/(smo[3]+smo[4])
#Smoking, heart disease
smo[4]
smo[4]/(smo[3]+smo[4])
print("Drinking and heart disease:")
dri<-table(heartdata$HeartDisease,heartdata$AlcoholDrinking)
dri
#No drinking, no heart disease
dri[1]
dri[1]/(dri[1]+dri[2])
#No drinking, heart disease
dri[2]
dri[2]/(dri[1]+dri[2])
#Drinking, no heart disease
dri[3]
dri[3]/(dri[3]+dri[4])
#Drinking, heart disease
dri[4]
dri[4]/(dri[3]+dri[4])
# Remaining factors: raw contingency tables only (rows = factor, cols = HeartDisease).
print("Diabetes and heart disease")
dia<-table(heartdata$Diabetic,heartdata$HeartDisease)
dia
print("Stroke and heart disease")
stro<-table(heartdata$Stroke,heartdata$HeartDisease)
stro
print("Walking and heart disease")
wal<-table(heartdata$DiffWalking,heartdata$HeartDisease)
wal
print("Physical activity and heart disease")
phy<-table(heartdata$PhysicalActivity,heartdata$HeartDisease)
phy
print("Asthma and heart disease")
ast<-table(heartdata$Asthma,heartdata$HeartDisease)
ast
print("Kidney disease and heart disease")
kid<-table(heartdata$KidneyDisease,heartdata$HeartDisease)
kid
print("Skin cancer and heart disease")
ski<-table(heartdata$SkinCancer,heartdata$HeartDisease)
ski
# Heart disease and Age
ggplot(heartdata, aes(x=HeartDisease, y=Age, fill=HeartDisease)) + geom_boxplot(outlier.shape = NA)+ggtitle("Heart Disease by Age")+coord_flip()
# Per-age, per-sex counts of heart-disease cases.
# Vectorized with table() instead of the original growing-vector loop that
# called subset() four times per age (O(n * ages)). Using
# factor(levels = agelist) keeps ages with zero positive cases in the table,
# matching the loop's behaviour.
agelist <- sort(unique(heartdata$Age))
age_factor <- factor(heartdata$Age, levels = agelist)
total_tab <- table(age_factor, heartdata$Sex)        # all respondents
pos_rows <- heartdata$HeartDisease == "Yes"
pos_tab <- table(age_factor[pos_rows], heartdata$Sex[pos_rows])  # cases only
# Long format: the Male block first, then the Female block, as in the original.
agedata <- data.frame(
  Age = rep(agelist, 2),
  Positive = c(pos_tab[, "Male"], pos_tab[, "Female"]),
  Total = c(total_tab[, "Male"], total_tab[, "Female"]),
  Sex = rep(c("Male", "Female"), each = length(agelist))
)
agedata['Percentage'] <- 100 * agedata$Positive / agedata$Total
agedata$Sex <- factor(agedata$Sex, levels = c("Male", "Female"))
ggplot(agedata, aes(x=Age,y=Percentage,fill=Sex)) + geom_area(position="identity") +scale_fill_manual(values=c("blue","magenta")) + ggtitle("Heart Disease by Age and Sex")
#ggplot(subset(agedata,Sex=="Female"), aes(x=Age,y=Percentage)) + geom_area()
# Heart disease and BMI
# Outliers suppressed; y-axis clipped to the bulk of the BMI distribution.
ggplot(heartdata, aes(x=HeartDisease, y=BMI, fill=HeartDisease)) + geom_boxplot(outlier.shape = NA)+ggtitle("Heart Disease by BMI")+ylim(15,45)
# Heart disease versus gen health
ggplot(posdis, aes(x=GenHealth,fill=GenHealth)) + geom_bar()+ggtitle("General Health for People With Heart Disease")
ggplot(negdis, aes(x=GenHealth,fill=GenHealth)) + geom_bar()+ggtitle("General Health for People Without Heart Disease")
ggplot(heartdata, aes(x=GenHealth)) + geom_bar(aes(fill=HeartDisease))+ggtitle("General Health for People With and Without Heart Disease")
# Split violin: left/right halves are the No/Yes heart-disease age densities.
ggplot(heartdata,aes(GenHealth,Age,fill=HeartDisease))+geom_split_violin()+ggtitle("Age and General Health for People With and Without Heart Disease")
# BMI and health
ggplot(heartdata, aes(x=GenHealth, y=BMI, fill=GenHealth)) + geom_boxplot(outlier.shape = NA)+ggtitle("General Health by BMI")+ylim(15,45)
# Race
ggplot(heartdata, aes(x=Race)) + geom_bar(aes(fill=HeartDisease))+ggtitle("Race of People With and Without Heart Disease")
ggplot(heartdata,aes(Race,Age,fill=HeartDisease))+geom_split_violin()+ggtitle("Age and Race for People With and Without Heart Disease")
# Percentage of heart-disease cases within each race.
# Computed from table margins instead of hand-typed cell indices. This fixes a
# bug in the original: the "Native" percentage divided by rac[11]+rac[10],
# which mixed the Other/No cell into the Native denominator.
rac <- table(heartdata$HeartDisease,heartdata$Race)
Race <- colnames(rac)  # factor level order: White, Hispanic, Black, Asian, Native, Other
Percentage <- as.numeric(100 * rac["Yes", ] / colSums(rac))
racedata <- data.frame(Race,Percentage)
racedata$Race <- factor(racedata$Race,levels=c("White","Hispanic","Black","Asian","Native","Other"))
ggplot(racedata,aes(x=Race,y=Percentage,fill=Race))+geom_bar(stat="identity")+ggtitle("Percentage of People With Heart Disease")
# Pie graphs
# Quick base-graphics overview of the marginal distribution of each factor.
hear<-table(heartdata$HeartDisease)
pie(hear,col = rainbow(length(hear)))
smok<-table(heartdata$Smoking)
pie(smok,col = rainbow(length(smok)))
drin<-table(heartdata$AlcoholDrinking)
pie(drin,col = rainbow(length(drin)))
race<-table(heartdata$Race)
pie(race,col = rainbow(length(race)))
sexx<-table(heartdata$Sex)
pie(sexx,col = rainbow(length(sexx)))
# Mental health
# Work on a copy restricted to people reporting at least one bad mental-health
# day, so the boxplots are not dominated by the large spike at zero.
# NOTE(review): this heartdata2 is later overwritten by the modeling section.
heartdata2 <- heartdata
heartdata2 <- heartdata2[heartdata2$MentalHealth != 0,]
# Bucket SleepTime against the commonly recommended 7-9 hours.
heartdata2$SleepTime <-ifelse(heartdata2$SleepTime<7,"Too little",ifelse(heartdata2$SleepTime>9,"Too much","Recommended"))
heartdata2$Sleep <- factor(heartdata2$SleepTime,levels=c("Too little","Recommended","Too much"))
#+coord_cartesian(ylim = c(0, 12))
ggplot(heartdata2, aes(x=Smoking, y=MentalHealth, fill=Smoking)) + geom_boxplot(outlier.shape = NA)+ggtitle("Mental Health for Smokers and Non-Smokers")
ggplot(heartdata2, aes(x=AlcoholDrinking, y=MentalHealth, fill=AlcoholDrinking)) + geom_boxplot(outlier.shape = NA)+ggtitle("Mental Health for Drinkers and Non-Drinkers")
ggplot(heartdata2, aes(x=PhysicalActivity, y=MentalHealth, fill=PhysicalActivity)) + geom_boxplot(outlier.shape = NA)+ggtitle("Mental Health versus Physical Activity")
ggplot(heartdata2, aes(x=Sleep, y=MentalHealth, fill=Sleep)) + geom_boxplot(outlier.shape = NA)+ggtitle("Mental Health versus Sleep")
###Chi-Square Test Is heart disease data related to gender??
# Contingency table: rows = HeartDisease (No/Yes), cols = Sex (Female/Male).
# xkabledply is a helper from the course's ezids package for pretty tables.
sextable= table(heartdata$HeartDisease,heartdata$Sex)
ezids::xkabledply(sextable, title="Contingency table for Heart Disease vs Gender ")
| Female | Male | |
|---|---|---|
| No | 156571 | 135851 |
| Yes | 11234 | 16139 |
# Chi-squared test of independence between heart disease and sex.
chitest_sex = chisq.test(sextable)
chitest_sex
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: sextable
## X-squared = 1568, df = 1, p-value <2e-16
Here, the p-value is less than 0.05, so we reject the null hypothesis: heart disease and gender are associated. Does the data support that race strongly affects heart disease?
# Contingency table of race vs heart disease, then a chi-squared test.
racetable = xtabs(~ Race+HeartDisease, data =heartdata )
racetable
## HeartDisease
## Race No Yes
## White 222705 22507
## Hispanic 26003 1443
## Black 21210 1729
## Asian 7802 266
## Native 4660 542
## Other 10042 886
chitest_race = chisq.test(racetable)
chitest_race
##
## Pearson's Chi-squared test
##
## data: racetable
## X-squared = 844, df = 5, p-value <2e-16
Here, the p-value is less than 0.05, so we reject the null hypothesis: the data supports that race has an effect on heart disease.
Does the data support an association between heart disease and general health?
# Contingency table: rows = HeartDisease, cols = GenHealth (Poor..Excellent).
gentable= table(heartdata$HeartDisease,heartdata$GenHealth)
ezids::xkabledply(gentable, title="Contingency table for Heart Disease vs Gen Health ")
| Poor | Fair | Good | Very good | Excellent | |
|---|---|---|---|---|---|
| No | 7439 | 27593 | 83571 | 108477 | 65342 |
| Yes | 3850 | 7084 | 9558 | 5381 | 1500 |
# Chi-squared test of independence between heart disease and general health.
chitest_gen = chisq.test(gentable)
chitest_gen
##
## Pearson's Chi-squared test
##
## data: gentable
## X-squared = 21542, df = 4, p-value <2e-16
Here, the p-value is less than 0.05, so we reject the null hypothesis: the data supports an association between heart disease and general health. ###T-test What is the average age of people suffering from heart disease?
# One-sample t-test: confidence interval for the mean age of people with
# heart disease (the null "mean = 0" is not interesting; the CI is the point).
heart_disease_on=subset(heartdata,HeartDisease=="Yes")
ttest_age <- t.test(heart_disease_on$Age)
ttest_age
##
## One Sample t-test
##
## data: heart_disease_on$Age
## t = 934, df = 27372, p-value <2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 68.0 68.3
## sample estimates:
## mean of x
## 68.2
The average age of people having heart disease is about 68. ###Test For Association
What variables affect mental health and physical health? In particular, do alcohol drinking and smoking?
# Correlation between mental-health days and smoking.
# as.numeric() on the factor maps No->1, Yes->2, so this is effectively a
# point-biserial correlation.
menhealth_smoke<-cor.test(heartdata$MentalHealth,as.numeric(heartdata$Smoking), method="pearson")
menhealth_smoke
##
## Pearson's product-moment correlation
##
## data: heartdata$MentalHealth and as.numeric(heartdata$Smoking)
## t = 48, df = 319793, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.0817 0.0886
## sample estimates:
## cor
## 0.0852
# Same point-biserial style correlation for alcohol drinking vs mental health.
menhealth_drinking<-cor.test(heartdata$MentalHealth,as.numeric(heartdata$AlcoholDrinking), method="pearson")
menhealth_drinking
##
## Pearson's product-moment correlation
##
## data: heartdata$MentalHealth and as.numeric(heartdata$AlcoholDrinking)
## t = 29, df = 319793, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.0478 0.0547
## sample estimates:
## cor
## 0.0513
Smoking and drinking do not have a strong correlation with mental health. Smoking has a stronger correlation with mental health than drinking alcohol does.
# Correlation between physical-health days and smoking (No->1, Yes->2).
phyhealth_smoke<-cor.test(heartdata$PhysicalHealth,as.numeric(heartdata$Smoking), method="pearson")
phyhealth_smoke
##
## Pearson's product-moment correlation
##
## data: heartdata$PhysicalHealth and as.numeric(heartdata$Smoking)
## t = 66, df = 319793, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.112 0.119
## sample estimates:
## cor
## 0.115
# Correlation between physical-health days and alcohol drinking.
phyhealth_drinking<-cor.test(heartdata$PhysicalHealth,as.numeric(heartdata$AlcoholDrinking), method="pearson")
phyhealth_drinking
##
## Pearson's product-moment correlation
##
## data: heartdata$PhysicalHealth and as.numeric(heartdata$AlcoholDrinking)
## t = -10, df = 319793, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.0207 -0.0138
## sample estimates:
## cor
## -0.0173
Smoking and drinking do not have a strong correlation with physical health. Smoking has a stronger correlation with physical health than drinking alcohol does, and drinking alcohol is negatively correlated.
What variables affect instances of heart disease?
Our goal is to find out the people who are likely to have heart disease in the future, so we can take some actions like a more detailed physical examination before the conditions become worse.
# ---- Rebuild the dataset for modeling -------------------------------------
# Fresh copy from disk (this replaces the heartdata2 used in the
# mental-health section above). Differences from the EDA preparation:
#  * pregnancy-only diabetes is merged into "Yes" (3 Diabetic levels),
#  * GenHealth becomes a numeric 0-4 score (Poor=0 ... Excellent=4),
#  * Age is the band's LOWER bound plus a uniform 0-4 jitter,
#  * the response is renamed y and moved to the last column (for bestglm).
heartdata2 <- data.frame(read.csv("heart_2020_cleaned.csv"))
n <- 10  # kept for parity with the first prep block; unused here

# Binary / categorical predictors as factors, in one pass.
fac_cols <- c("Smoking", "AlcoholDrinking", "Stroke", "DiffWalking", "Sex",
              "Race", "PhysicalActivity", "Asthma", "KidneyDisease", "SkinCancer")
heartdata2[fac_cols] <- lapply(heartdata2[fac_cols], as.factor)

# Merge "Yes (during pregnancy)" into "Yes" before factoring Diabetic.
heartdata2[heartdata2$Diabetic == 'Yes (during pregnancy)', "Diabetic"] <- 'Yes'
heartdata2$Diabetic <- as.factor(heartdata2$Diabetic)

# GenHealth as an ordinal 0-4 score via a single lookup (replaces the five
# sequential string-replacement passes of the original).
#heartdata2$GenHealth <- as.factor(heartdata2$GenHealth)
gen_levels <- c('Poor', 'Fair', 'Good', 'Very good', 'Excellent')
heartdata2$GenHealth <- as.numeric(match(heartdata2$GenHealth, gen_levels) - 1)

# Numeric Age: lower bound of the 5-year band plus a uniform 0-4 jitter
# ("80 or older" contributes ages 80-84).
# NOTE(review): sample() is not seeded here, so Age is not reproducible.
heartdata2$AgeCategory <- gsub(" or older", "-4", as.character(heartdata2$AgeCategory))
heartdata2$LoAge <- as.numeric(substr(heartdata2$AgeCategory, 1, 2))
heartdata2$rand <- sample(0:4, size = nrow(heartdata2), replace = TRUE)
heartdata2$Age <- with(heartdata2, LoAge + rand)
heartdata2 <- subset(heartdata2, select = -c(AgeCategory, LoAge, rand))

# Response column last, renamed to y.
heartdata2$y <- as.factor(heartdata2$HeartDisease)
heartdata2 = subset(heartdata2, select = -c(HeartDisease))
str(heartdata2)
First, because we need to do feature selection later, I moved the HeartDisease column to the end of the dataset and renamed it y.
Let's look at the proportion of heart-disease cases before continuing.
# Class balance of the response before resampling.
table( heartdata2$y )
##
## No Yes
## 292422 27373
The dataset is very unbalanced. Considering that the dataset is large, I use undersampling to balance it. Reference: https://www.analyticsvidhya.com/blog/2016/03/practical-guide-deal-imbalanced-classification-problems/
# Undersample the majority class so both classes have 27373 rows each
# (N = 2 * number of positive cases). loadPkg/unloadPkg are ezids helpers.
loadPkg("ROSE")
data_balanced_under <- ovun.sample(y ~ ., data = heartdata2, method = "under", N = 27373*2, seed = 1)$data
rm(heartdata2)
unloadPkg("ROSE")
We can find that the data is really balanced.
# Confirm the resampled data is balanced.
table( data_balanced_under$y )
##
## No Yes
## 27373 27373
And also the structure of the dataset.
# Structure of the balanced dataset (factors retained, GenHealth/Age numeric).
str(data_balanced_under)
## 'data.frame': 54746 obs. of 18 variables:
## $ BMI : num 30.9 21.9 25.8 25.8 30.1 ...
## $ Smoking : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
## $ AlcoholDrinking : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 2 1 1 ...
## $ Stroke : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ PhysicalHealth : num 0 0 1 5 1 0 0 0 3 0 ...
## $ MentalHealth : num 0 0 0 0 0 0 0 3 10 0 ...
## $ DiffWalking : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Sex : Factor w/ 2 levels "Female","Male": 2 2 1 2 1 1 2 1 1 1 ...
## $ Race : Factor w/ 6 levels "American Indian/Alaskan Native",..: 6 6 6 4 6 6 6 6 6 6 ...
## $ Diabetic : Factor w/ 3 levels "No","No, borderline diabetes",..: 1 1 1 1 1 1 1 1 3 1 ...
## $ PhysicalActivity: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ GenHealth : num 4 4 3 4 3 3 2 3 2 3 ...
## $ SleepTime : num 8 6 8 6 6 8 5 8 6 8 ...
## $ Asthma : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ KidneyDisease : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
## $ SkinCancer : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 1 ...
## $ Age : num 55 25 81 32 59 66 27 34 63 64 ...
## $ y : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
To train and evaluate the model, we will split the dataset into two parts. 80 percent of the dataset will be used to train the model and the rest will be used to test the accuracy of the model.
# Stratified 80/20 split: createDataPartition(p = .2) picks the 20% used as
# the TEST indices; everything else is the training set.
loadPkg("caret")
set.seed(4321)
test <- createDataPartition( data_balanced_under$y, p = .2, list = FALSE )
data_train <- data_balanced_under[ -test, ]
data_test <- data_balanced_under[ test, ]
unloadPkg("caret")
# Full logistic regression model on all predictors.
model_glm <- glm( y ~ ., data = data_train, family = binomial(logit) )
summary_glm <- summary(model_glm)
summary_glm
##
## Call:
## glm(formula = y ~ ., family = binomial(logit), data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0217 -0.7842 -0.0249 0.8147 2.9765
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.375878 0.144134 -23.42 < 2e-16 ***
## BMI 0.011718 0.001978 5.92 3.2e-09 ***
## SmokingYes 0.390957 0.024277 16.10 < 2e-16 ***
## AlcoholDrinkingYes -0.171383 0.053407 -3.21 0.00133 **
## StrokeYes 1.194871 0.050744 23.55 < 2e-16 ***
## PhysicalHealth 0.003983 0.001548 2.57 0.01008 *
## MentalHealth 0.005539 0.001582 3.50 0.00046 ***
## DiffWalkingYes 0.213718 0.033258 6.43 1.3e-10 ***
## SexMale 0.734255 0.024608 29.84 < 2e-16 ***
## RaceAsian -0.482339 0.133234 -3.62 0.00029 ***
## RaceBlack -0.370447 0.101298 -3.66 0.00026 ***
## RaceHispanic -0.209198 0.102247 -2.05 0.04076 *
## RaceOther -0.173323 0.112167 -1.55 0.12229
## RaceWhite -0.195416 0.091551 -2.13 0.03280 *
## DiabeticNo, borderline diabetes 0.198445 0.074399 2.67 0.00765 **
## DiabeticYes 0.486828 0.030465 15.98 < 2e-16 ***
## PhysicalActivityYes -0.032212 0.028335 -1.14 0.25561
## GenHealth -0.503055 0.014083 -35.72 < 2e-16 ***
## SleepTime -0.030295 0.007678 -3.95 8.0e-05 ***
## AsthmaYes 0.298173 0.034461 8.65 < 2e-16 ***
## KidneyDiseaseYes 0.663836 0.052098 12.74 < 2e-16 ***
## SkinCancerYes 0.145508 0.035878 4.06 5.0e-05 ***
## Age 0.058546 0.000947 61.80 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 60714 on 43795 degrees of freedom
## Residual deviance: 43318 on 43773 degrees of freedom
## AIC: 43364
##
## Number of Fisher Scoring iterations: 5
We can see from the model that the p-values for several Race levels and for PhysicalActivity are larger than 0.05, meaning those coefficients are not statistically significant. So we drop these two variables and fit the logistic regression again.
# Second model: drop Race and PhysicalActivity (insignificant coefficients).
model2_glm <- glm( y ~ . - Race - PhysicalActivity, data = data_train, family = binomial(logit) )
summary2_glm <- summary(model2_glm)
summary2_glm
##
## Call:
## glm(formula = y ~ . - Race - PhysicalActivity, family = binomial(logit),
## data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0166 -0.7839 -0.0258 0.8159 2.9828
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.647232 0.110721 -32.94 < 2e-16 ***
## BMI 0.011943 0.001965 6.08 1.2e-09 ***
## SmokingYes 0.400204 0.024130 16.59 < 2e-16 ***
## AlcoholDrinkingYes -0.165016 0.053373 -3.09 0.00199 **
## StrokeYes 1.189634 0.050649 23.49 < 2e-16 ***
## PhysicalHealth 0.004317 0.001542 2.80 0.00511 **
## MentalHealth 0.005698 0.001580 3.61 0.00031 ***
## DiffWalkingYes 0.216886 0.032854 6.60 4.1e-11 ***
## SexMale 0.735589 0.024534 29.98 < 2e-16 ***
## DiabeticNo, borderline diabetes 0.189406 0.074246 2.55 0.01074 *
## DiabeticYes 0.482124 0.030343 15.89 < 2e-16 ***
## GenHealth -0.502807 0.013947 -36.05 < 2e-16 ***
## SleepTime -0.029368 0.007669 -3.83 0.00013 ***
## AsthmaYes 0.298754 0.034427 8.68 < 2e-16 ***
## KidneyDiseaseYes 0.664202 0.052078 12.75 < 2e-16 ***
## SkinCancerYes 0.156204 0.035617 4.39 1.2e-05 ***
## Age 0.058824 0.000931 63.22 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 60714 on 43795 degrees of freedom
## Residual deviance: 43346 on 43779 degrees of freedom
## AIC: 43380
##
## Number of Fisher Scoring iterations: 5
We’ll quickly check two things for this model. First the p-values. Values below .05 indicates significance, which means the coefficient or so called parameters that are estimated by our model are reliable. And second, the pseudo R square. This value ranging from 0 to 1 indicates how much variance is explained by our model.
All the p-values of the models indicates significance, meaning that our model is a legitimate one. But R square of 0.29 tells that 29 percent of the variance is explained.
# Coefficients plus McFadden-style pseudo R-squared: 1 - deviance/null deviance.
list( summary2_glm$coefficient,
round( 1 - ( summary2_glm$deviance / summary2_glm$null.deviance ), 2 ) )
Have a look of the vif value.
# Variance inflation factors for multicollinearity screening (>10 flagged).
vif_md2 = faraway::vif(model2_glm)
vif_md2
## BMI SmokingYes
## 7.11 6.37
## AlcoholDrinkingYes StrokeYes
## 6.52 9.62
## PhysicalHealth MentalHealth
## 10.42 8.05
## DiffWalkingYes SexMale
## 8.70 6.57
## DiabeticNo, borderline diabetes DiabeticYes
## 5.59 7.09
## GenHealth SleepTime
## 11.08 6.67
## AsthmaYes KidneyDiseaseYes
## 6.79 8.44
## SkinCancerYes Age
## 6.40 11.13
We can find that some vif values are larger than 10, which means these variables are highly correlated, not acceptable. So I tried to drop one variable a time. At mean time, look at the p-value to ensure that the variables are significant. At the end, we got the model like below:
# Final model after iteratively dropping high-VIF variables (Age, Asthma,
# PhysicalHealth) in addition to Race and PhysicalActivity.
model3_glm <- glm( y ~ . - Race - PhysicalActivity - Age - Asthma - PhysicalHealth, data = data_train, family = binomial(logit) )
summary3_glm <- summary(model3_glm)
summary3_glm
##
## Call:
## glm(formula = y ~ . - Race - PhysicalActivity - Age - Asthma -
## PhysicalHealth, family = binomial(logit), data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.370 -0.858 -0.120 0.941 2.297
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.21961 0.08471 2.59 0.00953 **
## BMI -0.00625 0.00181 -3.45 0.00055 ***
## SmokingYes 0.48167 0.02257 21.34 < 2e-16 ***
## AlcoholDrinkingYes -0.40284 0.04980 -8.09 6.0e-16 ***
## StrokeYes 1.36603 0.04933 27.69 < 2e-16 ***
## MentalHealth -0.01653 0.00141 -11.74 < 2e-16 ***
## DiffWalkingYes 0.62529 0.03037 20.59 < 2e-16 ***
## SexMale 0.59018 0.02277 25.91 < 2e-16 ***
## DiabeticNo, borderline diabetes 0.42327 0.07167 5.91 3.5e-09 ***
## DiabeticYes 0.71741 0.02907 24.68 < 2e-16 ***
## GenHealth -0.57790 0.01223 -47.24 < 2e-16 ***
## SleepTime 0.03334 0.00728 4.58 4.7e-06 ***
## KidneyDiseaseYes 0.85136 0.05099 16.70 < 2e-16 ***
## SkinCancerYes 0.74038 0.03395 21.81 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 60714 on 43795 degrees of freedom
## Residual deviance: 48335 on 43782 degrees of freedom
## AIC: 48363
##
## Number of Fisher Scoring iterations: 4
# Re-check VIFs for the reduced model; all should now be below 10.
vif_md3 = faraway::vif(model3_glm)
vif_md3
## BMI SmokingYes
## 6.03 5.58
## AlcoholDrinkingYes StrokeYes
## 5.68 9.12
## MentalHealth DiffWalkingYes
## 6.39 7.44
## SexMale DiabeticNo, borderline diabetes
## 5.66 5.21
## DiabeticYes GenHealth
## 6.51 8.53
## SleepTime KidneyDiseaseYes
## 6.02 8.09
## SkinCancerYes
## 5.81
Now, the vif values are all below 10.
In this part, I want to use feature selection to find out the suitable variables from our current model. Because the data_train dataset is large and takes a lot of time to run. I changed to data_test to do the feature selection.
# Backward feature selection with bestglm on the reduced predictor set.
# NOTE(review): this is run on data_test for speed (as the prose admits),
# which leaks test information into model selection — consider a subsample
# of data_train instead.
data_feature_selection = subset(data_test, select = -c(Race, PhysicalActivity, PhysicalHealth, Age, Asthma))
str(data_feature_selection)
loadPkg("bestglm")
res.bestglm <- bestglm(Xy = data_feature_selection, family = binomial,
IC = "AIC", # Information criterion used to compare candidate models
method = "backward")
summary(res.bestglm)
## Fitting algorithm: AIC-glm
## Best Model:
## df deviance
## Null Model 10936 12036
## Full Model 10949 15180
##
## likelihood-ratio test - GLM
##
## data: H0: Null Model vs. H1: Best Fit AIC-glm
## X = 3144, df = 13, p-value <2e-16
# Top five candidate models ranked by AIC (Criterion column).
res.bestglm$BestModels
## BMI Smoking AlcoholDrinking Stroke MentalHealth DiffWalking Sex Diabetic
## 1 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 2 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## 3 TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## 4 TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## 5 FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## GenHealth SleepTime KidneyDisease SkinCancer Criterion
## 1 TRUE TRUE TRUE TRUE 12062
## 2 TRUE FALSE TRUE TRUE 12064
## 3 TRUE TRUE TRUE TRUE 12067
## 4 TRUE FALSE TRUE TRUE 12070
## 5 TRUE TRUE TRUE TRUE 12070
summary(res.bestglm$BestModels)
## BMI Smoking AlcoholDrinking Stroke MentalHealth
## Mode :logical Mode:logical Mode :logical Mode:logical Mode:logical
## FALSE:1 TRUE:5 FALSE:2 TRUE:5 TRUE:5
## TRUE :4 TRUE :3
##
##
##
## DiffWalking Sex Diabetic GenHealth SleepTime
## Mode:logical Mode:logical Mode:logical Mode:logical Mode :logical
## TRUE:5 TRUE:5 TRUE:5 TRUE:5 FALSE:2
## TRUE :3
##
##
##
## KidneyDisease SkinCancer Criterion
## Mode:logical Mode:logical Min. :12062
## TRUE:5 TRUE:5 1st Qu.:12064
## Median :12067
## Mean :12067
## 3rd Qu.:12070
## Max. :12070
unloadPkg("bestglm")
unloadPkg("leaps")
We can see from the feature selection that the best model keeps all these 13 variables, and its AIC (Akaike Information Criterion) is 12062, the lowest among these models.
The Receiver-Operating-Characteristic (ROC) curve and Area-Under-Curve (AUC) plot the true positive rate (sensitivity) against the false positive rate (1 − specificity). The area under the curve is always between 0.5 and 1; values higher than 0.8 are considered a good model fit.
loadPkg("pROC") # receiver operating characteristic curve, gives the diagnostic ability of a binary classifier system as its discrimination threshold is varied. The curve is on sensitivity/recall/true-positive-rate vs false_alarm/false-positive-rate/fall-out.
# In-sample predicted probabilities from the final model.
prob=predict(model3_glm, type = "response" )
data_train$prob=prob
# ROC of the training-set predictions against the true labels.
h <- roc(y~prob, data=data_train)
auc(h) # area-under-curve prefer 0.8 or higher.
plot(h)
# unloadPkg("pROC")
The AUC of the model is 0.795, slightly below 0.8. Because our model looks suitable and we have all the needed features, I suspect the data itself limits how high the AUC can go.
We can have a look of the Confusion matrix.
# install.packages("regclass")
library("regclass")
# confusion_matrix(admitLogit)
# Training-set confusion matrix for the final logit model (default 0.5 cutoff).
xkabledply( confusion_matrix(model3_glm), title = "Confusion matrix from Logit Model" )
| Predicted No | Predicted Yes | Total | |
|---|---|---|---|
| Actual No | 16646 | 5252 | 21898 |
| Actual Yes | 6957 | 14941 | 21898 |
| Total | 23603 | 20193 | 43796 |
# Detach regclass once the confusion matrix has been rendered.
unloadPkg("regclass")
From the confusion matrix, precision is 14941/(5252+14941) = 0.74, i.e. 74% of predicted positives are correct, and recall is 14941/(6957+14941) = 0.682, i.e. 68.2% of actual positives are found. In our model, I think recall is the more important metric, because a false negative is a heart-disease patient missed by the model, which can have harmful consequences.
# Test-set confusion matrix using InformationValue::confusionMatrix
# (note: this masks caret's function of the same name while loaded).
loadPkg("InformationValue")
predicted = predict(model3_glm, data_test, type = "response")
confusionMatrix(data_test$y, predicted)
## No Yes
## 0 4220 1753
## 1 1255 3722
unloadPkg("InformationValue")
Then we use the test dataset to check whether the model generalizes. Using data_test to make predictions and compute the confusion matrix, precision is 3722/(1255+3722) = 0.748 and recall is 3722/(1753+3722) = 0.68. These values are similar to the training set's, which suggests our model is reliable for predicting heart disease.
We’ll return to our logistic regression model for a minute and look at the estimated parameters (coefficients). Since the model’s parameters are recorded on the logit scale, we’ll transform them into odds ratios so they are easier to interpret.
loadPkg("broom")
# Tidy coefficient table: term, log-odds estimate, z statistic.
coefficient <- tidy(model3_glm)[ , c( "term", "estimate", "statistic" ) ]
# Transform the coefficients from log-odds to odds ratios for interpretation.
coefficient$estimate <- exp( coefficient$estimate )
# Order rows by descending odds ratio (ORs are positive, so abs() is a no-op).
coefficient[sort(abs(coefficient$estimate),decreasing=T,index.return=T)[[2]],]
## # A tibble: 14 × 3
## term estimate statistic
## <chr> <dbl> <dbl>
## 1 StrokeYes 3.92 27.7
## 2 KidneyDiseaseYes 2.34 16.7
## 3 SkinCancerYes 2.10 21.8
## 4 DiabeticYes 2.05 24.7
## 5 DiffWalkingYes 1.87 20.6
## 6 SexMale 1.80 25.9
## 7 SmokingYes 1.62 21.3
## 8 DiabeticNo, borderline diabetes 1.53 5.91
## 9 (Intercept) 1.25 2.59
## 10 SleepTime 1.03 4.58
## 11 BMI 0.994 -3.45
## 12 MentalHealth 0.984 -11.7
## 13 AlcoholDrinkingYes 0.668 -8.09
## 14 GenHealth 0.561 -47.2
unloadPkg("broom")
We can find from the table that other diseases (stroke, kidney disease, diabetic, SkinCancer), general health conditions, sex, DiffWalking (serious difficulty walking or climbing stairs), and smoking habit all largely influence the possibility of heart disease. It’s a little weird that drinking alcohol will reduce the possibility of heart disease. As we always think, drinking is not a good habit, maybe the data also includes people who drink some little wine.
#Creating new heartdata DF with different name
library(dplyr)
clean_heart<-heartdata
#Assigning ifelse logic to Mental Health. See if the person is not feeling ok in 15 days or more during the month
# NOTE(review): the labels read inverted — "No" marks people with >= 15 bad
# mental-health days. Confirm this is the intended coding of "HighMH".
HighMH = ifelse(clean_heart$MentalHealth>=15, "No", "Yes")
clean_heart = data.frame(clean_heart, HighMH)
#Selecting only the meaningful columns for prediction
clean_heart <- select(clean_heart, HighMH, Smoking, Sex, AlcoholDrinking, PhysicalActivity)
# AlcoholDrinking is already a factor from the earlier prep, so it is not re-coerced here.
clean_heart <- mutate(clean_heart, HighMH=factor(HighMH), Smoking=factor(Smoking), Sex=factor(Sex), PhysicalActivity=factor(PhysicalActivity))
library('rpart.plot')
# Split a data frame into a training or testing partition by row position.
# The first `size` fraction of rows is the training set; the remainder is the
# test set. NOTE(review): this is a deterministic head/tail split, not a
# random sample -- rows are taken in their original order.
#
# data  : data frame to split
# size  : fraction of rows assigned to the training partition (default 0.8)
# train : TRUE returns the training rows, FALSE returns the remaining rows
create_train_test <- function(data, size = 0.8, train = TRUE) {
  n_row <- nrow(data)
  # floor() avoids a fractional count (e.g. 0.8 * 7 = 5.6), and seq_len()
  # is safe when the count is 0 (1:0 would yield c(1, 0))
  n_train <- floor(size * n_row)
  train_sample <- seq_len(n_train)
  if (train) {
    data[train_sample, ]
  } else if (n_train == 0L) {
    # nothing to drop: data[-integer(0), ] would select zero rows, not all
    data
  } else {
    data[-train_sample, ]
  }
}
#Splitting clean_heart into training and testing data
library(caTools)
# Seed set for reproducibility -- note that create_train_test() splits
# deterministically by row order and does not actually use randomness
set.seed(123)
data_train <- create_train_test(clean_heart, 0.8, train = TRUE)
data_test <- create_train_test(clean_heart, 0.8, train = FALSE)
# Sanity-check the partition sizes (should be roughly 80% / 20% of the rows)
dim(data_train)
dim(data_test)
# Alternative random split via caTools, left disabled:
#sample = sample.split(clean_heart$MentalHealth, SplitRatio = .70)
#train = subset(clean_heart, sample==TRUE)
#test = subset(clean_heart, sample==FALSE)
#Training the Decision Tree Classifier
# Predict Smoking from all remaining columns. minsplit = 1, minbucket = 1 and
# a small cp allow a very deep tree -- NOTE(review): this risks overfitting;
# consider checking cp via printcp()/cross-validation
tree <- rpart(Smoking ~., data=data_train, method="class", control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.001))
print(tree)
summary(tree)
library(rpart) # Popular decision tree algorithm
library(rattle) # Fancy tree plot
library(rpart.plot) # Enhanced tree plots
library(RColorBrewer) # Color selection for fancy tree plot
library(party) # Alternative decision tree algorithm
library(partykit) # Convert rpart object to BinaryTree
library(caret)
# Base-graphics rendering of the fitted tree
plot(tree, uniform=TRUE, main="Classification Tree for Smoking")
text(tree, use.n=TRUE, all=TRUE, cex=.8)
# NOTE(review): `col` is defined here but never passed to any plotting call below
col <- c("#FD8D3C", "#FD8D3C", "#FD8D3C", "#BCBDDC",
"#FDD0A2", "#FD8D3C", "#BCBDDC")
# Enhanced rendering via rpart.plot::prp
prp(tree, type=2, extra=104, nn=TRUE, ni=TRUE, fallen.leaves=TRUE,
faclen=0, varlen=0, shadow.col="grey", branch.lty=3)
# we can also change extra=104 (see ?rpart.plot for the available codes)
library("RColorBrewer")
fancyRpartPlot(tree, main="Classification Tree for Smoking", palettes="PuRd", type=2)
# Second working copy of heartdata, used for the sleep/age tree
library(dplyr)
clean_heart1 <- heartdata
# Ensure numeric types before the threshold comparisons below
clean_heart1$SleepTime <- as.numeric(clean_heart1$SleepTime)
clean_heart1$Age <- as.numeric(clean_heart1$Age)
# AvSleep: "No" when the person sleeps 7 or more hours, "Yes" otherwise
# NOTE(review): label direction looks inverted relative to the name -- confirm
AvSleep <- ifelse(clean_heart1$SleepTime >= 7, "No", "Yes")
clean_heart1$AvSleep <- AvSleep
# Age30Plus: "No" when the person is 30 or older, "Yes" otherwise
# NOTE(review): same inverted-label concern as AvSleep -- confirm
Age30Plus <- ifelse(clean_heart1$Age >= 30, "No", "Yes")
clean_heart1$Age30Plus <- Age30Plus
# Keep only the prediction columns and coerce the categoricals to factors
clean_heart1 <- clean_heart1 %>%
  select(Smoking, Age30Plus, AvSleep, Race, HeartDisease, PhysicalActivity) %>%
  mutate(
    Race = factor(Race),
    PhysicalActivity = factor(PhysicalActivity),
    Smoking = factor(Smoking)
  )
library('rpart.plot')
# Split a data frame into a training or testing partition by row position
# (duplicate of create_train_test, kept under its original name for the
# clean_heart1 pipeline). The first `size` fraction of rows is the training
# set; the remainder is the test set. NOTE(review): deterministic head/tail
# split, not a random sample.
#
# data  : data frame to split
# size  : fraction of rows assigned to the training partition (default 0.8)
# train : TRUE returns the training rows, FALSE returns the remaining rows
create_train_test1 <- function(data, size = 0.8, train = TRUE) {
  n_row <- nrow(data)
  # floor() avoids a fractional count (e.g. 0.8 * 7 = 5.6), and seq_len()
  # is safe when the count is 0 (1:0 would yield c(1, 0))
  n_train <- floor(size * n_row)
  train_sample <- seq_len(n_train)
  if (train) {
    data[train_sample, ]
  } else if (n_train == 0L) {
    # nothing to drop: data[-integer(0), ] would select zero rows, not all
    data
  } else {
    data[-train_sample, ]
  }
}
#Splitting clean_heart into training and testing data
library(caTools)
# Seed set for reproducibility -- note that create_train_test1() splits
# deterministically by row order and does not actually use randomness
set.seed(123)
data_train1 <- create_train_test1(clean_heart1, 0.8, train = TRUE)
data_test1 <- create_train_test1(clean_heart1, 0.8, train = FALSE)
# Sanity-check the partition sizes (should be roughly 80% / 20% of the rows)
dim(data_train1)
dim(data_test1)
# Alternative random split via caTools, left disabled:
#sample = sample.split(clean_heart$MentalHealth, SplitRatio = .70)
#train = subset(clean_heart, sample==TRUE)
#test = subset(clean_heart, sample==FALSE)
#Training the Decision Tree Classifier
# Predict Smoking from the age/sleep/race features. minsplit = 1,
# minbucket = 1 and a small cp allow a very deep tree -- NOTE(review): this
# risks overfitting; consider checking cp via printcp()/cross-validation
tree1 <- rpart(Smoking ~., data=data_train1, method="class", control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.001))
print(tree1)
summary(tree1)
library(rpart) # Popular decision tree algorithm
library(rattle) # Fancy tree plot
library(rpart.plot) # Enhanced tree plots
library(RColorBrewer) # Color selection for fancy tree plot
library(party) # Alternative decision tree algorithm
library(partykit) # Convert rpart object to BinaryTree
library(caret)
# Base-graphics rendering of the tree fitted on clean_heart1.
# Title fixed: tree1 predicts Smoking (Smoking ~ .), not Age.
plot(tree1, uniform=TRUE, main="Classification Tree for Smoking")
text(tree1, use.n=TRUE, all=TRUE, cex=.8)
# NOTE(review): `col` is defined here but never passed to any plotting call below
col <- c("#FD8D3C", "#FD8D3C", "#FD8D3C", "#BCBDDC",
"#FDD0A2", "#FD8D3C", "#BCBDDC")
# Enhanced rendering via rpart.plot::prp
prp(tree1, type=2, extra=104, nn=TRUE, ni=TRUE, fallen.leaves=TRUE,
faclen=0, varlen=0, shadow.col="grey", branch.lty=3)
# we can also change extra=104 (see ?rpart.plot for the available codes)
library("RColorBrewer")
fancyRpartPlot(tree1, main="Classification Tree for Smoking", palettes="PuRd", type=2)
#I created set of rules from the decision tree to see the probabilities per rule
rpart.rules(tree1)
#asRules(tree)